I started watching David Robinson’s Tidy Tuesday screencasts to learn more about data wrangling and visualization (exploratory data analysis). I also do research in teaching and learning languages, so this was a great excuse to port his R code to Python and note differences between the two.
An analysis of the 538 “College Major and Income” dataset from the #tidytuesday project.
David’s code
# Load the tidyverse meta-package (dplyr, ggplot2, tidyr, readr, ...)
library(tidyverse)
# scales: axis formatters such as dollar_format() / comma_format()
library(scales)
# ggrepel: non-overlapping text labels for ggplot2
library(ggrepel)
# plotly: interactive graphics
library(plotly)
# Use the light ggplot2 theme for every plot in this post
theme_set(theme_light())
# Python equivalents of the R setup above.
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker  # tick-label formatters (dollar/comma)
import seaborn as sns
import plotly as plotly
import plotly.graph_objects as go
from siuba.dply.forcats import fct_lump  # siuba's port of forcats::fct_lump
# Show all columns when printing wide data frames
pd.set_option('display.max_columns', 500)
# Read the raw 538 data; readr::read_csv returns a tibble
recent_grads <- read_csv("recent-grads.csv")
# Peek at the first six rows
head(recent_grads)
## # A tibble: 6 x 21
## Rank Major_code Major Total Men Women Major_category ShareWomen Sample_size
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1 2419 PETR… 2339 2057 282 Engineering 0.121 36
## 2 2 2416 MINI… 756 679 77 Engineering 0.102 7
## 3 3 2415 META… 856 725 131 Engineering 0.153 3
## 4 4 2417 NAVA… 1258 1123 135 Engineering 0.107 16
## 5 5 2405 CHEM… 32260 21239 11021 Engineering 0.342 289
## 6 6 2418 NUCL… 2573 2200 373 Engineering 0.145 17
## # … with 12 more variables: Employed <dbl>, Full_time <dbl>, Part_time <dbl>,
## # Full_time_year_round <dbl>, Unemployed <dbl>, Unemployment_rate <dbl>,
## # Median <dbl>, P25th <dbl>, P75th <dbl>, College_jobs <dbl>,
## # Non_college_jobs <dbl>, Low_wage_jobs <dbl>
# pandas equivalent of readr::read_csv
recent_grads = pd.read_csv("recent-grads.csv") # side note: entering direct raw github url resulted in some parser error.
recent_grads.head()
## Rank Major_code Major Total \
## 0 1 2419 PETROLEUM ENGINEERING 2339.0
## 1 2 2416 MINING AND MINERAL ENGINEERING 756.0
## 2 3 2415 METALLURGICAL ENGINEERING 856.0
## 3 4 2417 NAVAL ARCHITECTURE AND MARINE ENGINEERING 1258.0
## 4 5 2405 CHEMICAL ENGINEERING 32260.0
##
## Men Women Major_category ShareWomen Sample_size Employed \
## 0 2057.0 282.0 Engineering 0.120564 36 1976
## 1 679.0 77.0 Engineering 0.101852 7 640
## 2 725.0 131.0 Engineering 0.153037 3 648
## 3 1123.0 135.0 Engineering 0.107313 16 758
## 4 21239.0 11021.0 Engineering 0.341631 289 25694
##
## Full_time Part_time Full_time_year_round Unemployed Unemployment_rate \
## 0 1849 270 1207 37 0.018381
## 1 556 170 388 85 0.117241
## 2 558 133 340 16 0.024096
## 3 1069 150 692 40 0.050125
## 4 23170 5180 16697 1672 0.061098
##
## Median P25th P75th College_jobs Non_college_jobs Low_wage_jobs
## 0 110000 95000 125000 1534 364 193
## 1 75000 55000 90000 350 257 50
## 2 73000 50000 105000 456 176 0
## 3 70000 43000 80000 529 102 0
## 4 65000 50000 75000 18314 4440 972
# Cleaning step: sort majors by median salary and tidy the Major labels
major_processed <- recent_grads %>%
arrange(desc(Median)) %>%
# change all caps to words with first letter capitalized (str_to_title),
# then make Major a factor ordered by Median so later plots sort correctly
mutate(Major = str_to_title(Major),
Major = fct_reorder(Major, Median))
# Top 15 majors by median salary
head(major_processed[c('Major', 'Median')], 15)
## # A tibble: 15 x 2
## Major Median
## <fct> <dbl>
## 1 Petroleum Engineering 110000
## 2 Mining And Mineral Engineering 75000
## 3 Metallurgical Engineering 73000
## 4 Naval Architecture And Marine Engineering 70000
## 5 Chemical Engineering 65000
## 6 Nuclear Engineering 65000
## 7 Actuarial Science 62000
## 8 Astronomy And Astrophysics 62000
## 9 Mechanical Engineering 60000
## 10 Electrical Engineering 60000
## 11 Computer Engineering 60000
## 12 Aerospace Engineering 60000
## 13 Biomedical Engineering 60000
## 14 Materials Science 60000
## 15 Engineering Mechanics Physics And Science 58000
import string
# arrange(desc(Median)) equivalent
major_processed = recent_grads.sort_values(by='Median', ascending=False)
# str_to_title equivalent: title-case each Major with string.capwords.
# Fix: the original additionally indexed the result with (-Median).argsort(),
# but pandas column assignment aligns on the index, so that reindexing was a
# confusing no-op (and the "argsort messes up ties" gotcha disappears with it).
major_processed['Major'] = major_processed['Major'].apply(string.capwords)
# Restore the original row order (the CSV is already ranked by Median)
major_processed.sort_index(inplace = True)
# fct_reorder(Major, Median) has no direct pandas analogue here; display
# order is handled at plot time instead.
major_processed[['Major', 'Median']].head(15)
## Major Median
## 0 Petroleum Engineering 110000
## 1 Mining And Mineral Engineering 75000
## 2 Metallurgical Engineering 73000
## 3 Naval Architecture And Marine Engineering 70000
## 4 Chemical Engineering 65000
## 5 Nuclear Engineering 65000
## 6 Actuarial Science 62000
## 7 Astronomy And Astrophysics 62000
## 8 Mechanical Engineering 60000
## 9 Electrical Engineering 60000
## 10 Computer Engineering 60000
## 11 Aerospace Engineering 60000
## 12 Biomedical Engineering 60000
## 13 Materials Science 60000
## 14 Engineering Mechanics Physics And Science 58000
# Common pattern: group_by -> summarize -> arrange
by_major_category <- major_processed %>%
# drop majors with unknown graduate counts before aggregating
filter(!is.na(Total)) %>%
group_by(Major_category) %>%
# summarize is nice when you need to apply different functions to columns
summarize(Men = sum(Men),
Women = sum(Women),
Total = sum(Total),
# weight the medians by sample size
MedianSalary = sum(Median * Sample_size) / sum(Sample_size)) %>%
# share of women among all graduates in the category
mutate(ShareWomen = Women / Total) %>%
arrange(desc(ShareWomen))
by_major_category
## # A tibble: 16 x 6
## Major_category Men Women Total MedianSalary ShareWomen
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Health 75517 387713 4.63e5 43694. 0.837
## 2 Education 103526 455603 5.59e5 32364. 0.815
## 3 Psychology & Social Work 98115 382892 4.81e5 31234. 0.796
## 4 Interdisciplinary 2817 9479 1.23e4 35000 0.771
## 5 Communications & Journalism 131921 260680 3.93e5 34738. 0.664
## 6 Arts 134390 222740 3.57e5 32046. 0.624
## 7 Humanities & Liberal Arts 272846 440622 7.13e5 32192. 0.618
## 8 Biology & Life Science 184919 268943 4.54e5 34379. 0.593
## 9 Industrial Arts & Consumer Serv… 103781 126011 2.30e5 34186. 0.548
## 10 Social Science 256834 273132 5.30e5 39200. 0.515
## 11 Law & Public Policy 91129 87978 1.79e5 35635. 0.491
## 12 Business 667852 634524 1.30e6 40890. 0.487
## 13 Physical Sciences 95390 90089 1.85e5 38477. 0.486
## 14 Agriculture & Natural Resources 40357 35263 7.56e4 35586. 0.466
## 15 Computers & Mathematics 208725 90283 2.99e5 46993. 0.302
## 16 Engineering 408307 129276 5.38e5 55916. 0.240
# R -> pandas challenge (help!):
# There might be a better way to do this, but can't figure out how MedianSalary and ShareWomen can be
# declared within the agg() like you could with R's summarise.
# Workaround (which is actually fewer lines): break things up into statements.
by_major_category_grp = major_processed[major_processed.Total.notnull()].groupby('Major_category')
# Sum the graduate counts per category.
# Fix: GroupBy tuple indexing (grp['Men', 'Women', 'Total']) was deprecated in
# pandas 0.25 and removed in pandas 2.0 -- pass a list of column names instead.
by_major_category = by_major_category_grp[['Men', 'Women', 'Total']].sum()
# MedianSalary: per-category median salary weighted by sample size
by_major_category['MedianSalary'] = by_major_category_grp.apply(lambda x: sum(x.Median * x.Sample_size) / sum(x.Sample_size))
# ShareWomen: fraction of women among all graduates in the category
by_major_category['ShareWomen'] = by_major_category['Women'] / by_major_category['Total']
by_major_category.sort_values('ShareWomen', ascending=False, inplace=True)
by_major_category
## Men Women Total \
## Major_category
## Health 75517.0 387713.0 463230.0
## Education 103526.0 455603.0 559129.0
## Psychology & Social Work 98115.0 382892.0 481007.0
## Interdisciplinary 2817.0 9479.0 12296.0
## Communications & Journalism 131921.0 260680.0 392601.0
## Arts 134390.0 222740.0 357130.0
## Humanities & Liberal Arts 272846.0 440622.0 713468.0
## Biology & Life Science 184919.0 268943.0 453862.0
## Industrial Arts & Consumer Services 103781.0 126011.0 229792.0
## Social Science 256834.0 273132.0 529966.0
## Law & Public Policy 91129.0 87978.0 179107.0
## Business 667852.0 634524.0 1302376.0
## Physical Sciences 95390.0 90089.0 185479.0
## Agriculture & Natural Resources 40357.0 35263.0 75620.0
## Computers & Mathematics 208725.0 90283.0 299008.0
## Engineering 408307.0 129276.0 537583.0
##
## MedianSalary ShareWomen
## Major_category
## Health 43693.740419 0.836977
## Education 32363.517503 0.814844
## Psychology & Social Work 31234.402516 0.796022
## Interdisciplinary 35000.000000 0.770901
## Communications & Journalism 34738.243123 0.663982
## Arts 32045.858896 0.623694
## Humanities & Liberal Arts 32192.322097 0.617578
## Biology & Life Science 34378.549849 0.592566
## Industrial Arts & Consumer Services 34185.588915 0.548370
## Social Science 39200.371098 0.515376
## Law & Public Policy 35635.142119 0.491204
## Business 40889.841986 0.487205
## Physical Sciences 38477.396658 0.485710
## Agriculture & Natural Resources 35586.142322 0.466318
## Computers & Mathematics 46993.426573 0.301942
## Engineering 55915.854649 0.240476
# visualize the distribution of salary for each major category using boxplots
major_processed %>%
# reorder Major_category according to the Median (top -> highest salary)
mutate(Major_category = fct_reorder(Major_category, Median)) %>%
# Major_category as X, Median as y
ggplot(aes(Major_category, Median, fill = Major_category)) +
geom_boxplot() +
# reformats the Median (y) as currency in $
scale_y_continuous(labels = dollar_format()) +
# make sure the salary axis starts at zero
expand_limits(y = 0) +
# since it's hard to read the labels for the x axis, flip coords
coord_flip() +
# the fill legend just duplicates the axis labels, so drop it
theme(legend.position = "none")
clear_plots_axes()
major_processed_dist_salary = major_processed.copy()
# fct_reorder equivalent: order categories by their median salary, descending.
# Fix: aggregate only the Median column -- calling .agg('median') on the whole
# frame raises on the non-numeric columns (Major, Major_category) in modern pandas.
cat_order = (major_processed_dist_salary
             .groupby('Major_category')['Median']
             .median()
             .sort_values(ascending=False)
             .index.tolist())
ax = sns.boxplot(
    x = 'Median',
    y = 'Major_category',
    data = major_processed_dist_salary,
    order = cat_order)
# expand_limits(y = 0) equivalent: start the x axis at 0 and stretch it past
# the Engineering outlier.  Fix: the original called plt.xlim twice -- once
# redundantly before any axes existed; set it once, after plotting.
_ = plt.xlim(0, major_processed_dist_salary['Median'].max() + 1000)
# put $ in front of the Median x-axis tick labels (dollar_format equivalent)
_ = ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('${x:,.0f}'))
ax.grid(True)
plt.tight_layout()
plt.show()
major_processed %>%
# only majors with a reasonably large survey sample
filter(Sample_size >= 100) %>%
# top 20 (major_processed is already sorted by Median, descending)
head(20) %>%
ggplot(aes(Major, Median, color=Major_category)) +
geom_point() +
# show intervals (range of salaries)
geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
# geom_point doesn't start at 0 whereas geom_col does
# so need to expand scale to start from 0
expand_limits(y = 0) +
scale_y_continuous(labels = dollar_format()) +
coord_flip() +
labs(title = "What are the highest earning majors?",
subtitle = "Top 20 majors with at least a 100 graduates surveyed. Bars represent the 25th to 75th percentile.",
x = "",
y = "Median salary of graduates")
clear_plots_axes()
sns.set_style('whitegrid')
# Top 20 highest-earning majors with at least 100 graduates surveyed
major_processed_highest_earning = major_processed.copy().query('Sample_size >= 100').head(20)
ax = sns.pointplot(
    y = 'Major',
    x = 'Median',
    hue = 'Major_category',
    data = major_processed_highest_earning,
    dodge = False,
    join = False)
# expand_limits equivalent: include 0 and leave room for the 75th-percentile bars.
# Fix: the original took the upper bound from major_processed_dist_salary, a
# different data frame defined in an earlier cell (copy-paste slip).
_ = plt.xlim(0, major_processed_highest_earning['P75th'].max() + 1000)
# put $ in front of the Median x-axis tick labels
_ = ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('${x:,.0f}'))
# set the legend outside of the plot, slightly above the plot
_ = plt.legend(bbox_to_anchor = (1.8, .0), loc = 'lower center', borderaxespad = 4.)
# geom_errorbar(aes(ymin = P25th, ymax = P75th)) equivalent.
# Fix: the original computed np.percentile over each major's single Median
# value (a no-op) and scaled it by an arbitrary 0.25.  The dataset already
# holds the quartiles, so the asymmetric half-widths are simply
# Median - P25th (left) and P75th - Median (right).
xerr = [major_processed_highest_earning.Median - major_processed_highest_earning.P25th,
        major_processed_highest_earning.P75th - major_processed_highest_earning.Median]
# limitation: couldn't figure out how to make the bar color follow the hue
# (Major_category)!  Possible solution: https://stackoverflow.com/a/21915157
_ = plt.errorbar(
    major_processed_highest_earning.Median,
    major_processed_highest_earning.Major,
    xerr = xerr, capsize = 6, elinewidth = 2, linewidth = 2, fmt = 'none', ecolor='lightgrey')
# title (https://stackoverflow.com/a/52937244)
_ = ax.text(x = 0.5, y = 1.1, s = 'What are the highest earning majors?', fontsize = 20, weight = 'bold', ha = 'center', va = 'bottom', transform = ax.transAxes)
# subtitle
_ = ax.text(x = 0.5, y = 1.05, s = 'Top 20 majors with at least a 100 graduates surveyed. Bars represent the 25th to 75th percentile.', fontsize = 12, alpha = 0.75, ha = 'center', va = 'bottom', transform = ax.transAxes)
# plt.tight_layout()
plt.show()
major_processed %>%
# largest 20 majors by total enrollment
arrange(desc(Total)) %>%
head(20) %>%
# order the bars by total enrollment
mutate(Major = fct_reorder(Major, Total)) %>%
# `gather` collapses the two columns (Women, Men) into a single Gender and value is Number
# (gather is superseded in tidyr; pivot_longer() is the modern equivalent)
gather(Gender, Number, Women, Men) %>%
ggplot(aes(Major, Number, fill = Gender)) +
# comma-formatted counts on the continuous axis
scale_y_continuous(labels = comma_format()) +
geom_col() +
coord_flip()
sns.set_style("whitegrid")
major_processed_gender = major_processed.copy()
major_processed_gender['total'] = major_processed_gender.Men + major_processed_gender.Women
major_processed_gender = major_processed_gender.sort_values(by='Total', ascending = False).head(20)
# Plotting order: the frame is already the top 20 sorted by Total descending
# and each Major appears once, so the order is just the Major column.
# Fix: the original groupby('Major').agg('median') raises on the non-numeric
# columns in modern pandas and was redundant for single-row groups.
cat_order = major_processed_gender['Major'].tolist()
# Simple way to do a stacked bar chart: barplot with the total first...
ax = sns.barplot(y = 'Major', x = 'total', data = major_processed_gender, order = cat_order, label = 'Men', color='salmon')
# ...then barplot with the overlaying variable on top
ax = sns.barplot(y = 'Major', x = 'Women', label = 'Women', data = major_processed_gender, order = cat_order, color='cyan')
# Axis cosmetics go after the axes exist.  Fix: the original used `ax` (a stale
# object from the previous cell) before any plot was drawn here, and passed an
# invalid third positional argument to plt.xlim.
_ = plt.xlim(0, major_processed_gender['total'].max() + 10000)
_ = ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
# need to set legend and label since we didn't make use of the `hue` parameter to do our fill
_ = ax.legend(ncol = 2, frameon = False, fontsize = 'large')
_ = ax.set(xlabel="Number")
plt.tight_layout()
plt.show()
major_processed %>%
# only well-sampled majors
filter(Sample_size >= 100) %>%
ggplot(aes(Sample_size, Median)) +
geom_point() +
# label each point with the major, dropping labels that would overlap
geom_text(aes(label = Major), check_overlap = TRUE, vjust = 1, hjust = 1) +
# sample sizes span orders of magnitude, so log-scale the x axis
scale_x_log10()